# Run the following commands in terminal: cd ~ # navigate to your home directory mkdir -p ChIP-seq/{data,results,scripts,software} # create directories to store data and result files cd ChIP-seq/data/ # move to your data folder # create a text file to store the SRRids of interest (called list.txt here) cat list.txt | parallel fastq-dump --split-files {} # download datasets, separating paired reads with split-files## optional method to download all NANOG ChIP-Seq files in project PRJNA261253 esearch -db sra -query PRJNA261253 | esummary > summary.xml cat summary.xml | xtract -pattern DocumentSummary -element Run@acc Title > summary.txt cat summary.txt | grep NANOG | cut -f 1 > runids.txt cat runids.txt | parallel fastq-dump --split-files {} # rename files for easier analysis ( we are only using 1 of the paired read files for each run for this demo) mv SRR1576335_1.fastq dME_Nanog.fastq mv SRR1576337_1.fastq HUES64_rep1_Nanog.fastq mv SRR1576338_1.fastq HUES64_rep2_Nanog.fastq mv SRR1576340_1.fastq dEN_Nanog.fastq mv SRR1576341_1.fastq dEC_Nanog.fastq
# now get or build the hg19 reference human genome # Option 1 - get bowtie1 pre-built hg19 from https://bowtie-bio.sourceforge.net/index.shtml # Option 2 - build and index it yourself # download individual files and chromosome size data URL=http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/chromFa.tar.gz curl $URL | tar zxv curl http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes > refs/hg19.chrom.sizes # concatenate all chromosomes together mv your_folder/*.fa refs/ REF=refs/hg19.fa cat refs/chr*.fa > $REF # on Quest load bowtie 1 and build your reference genome with it module load bowtie/1.2.2 # for short reads < 50bp bowtie-build $REF bowtie_hg19
# now align chip-seq reads to indexed genome to create SAM files # create names.txt to hold sample names -- dME_Nanog etc BASENAME=bowtie_hg19 cat names.txt | parallel bowtie -p 10 --best --chunkmbs 320 $BASENAME -q data/{}.fastq -S results/bams/{}.sam# convert sam to sorted bam files cat names.txt | parallel samtools sort results/bams/{}.sam -o results/bams/{}.bam # index bam files to get .bam.bai files cat names.txt | parallel samtools index results/bams/{}.bam # optional remove sam files since they are large and no longer needed rm results/bams/*sam
# load deeptools on Quest module load deeptools/3.5.1 # create a file called "names.txt" which contains all the experiment file names. In this case names.txt looks like this- HUES64_rep1_Nanog HUES64_rep2_Nanog dEC_Nanog dEN_Nanog dME_Nanog # to get bedgraph files which are human readable cat names.txt | parallel bamCoverage --bam results/bams/{}.bam -o results/bams/{}.SeqDepthNorm.bw --outFileFormat bedgraph # to get bigWig files for graphing later cat names.txt | parallel bamCoverage -b results/bams/{}.bam --normalizeUsing RPKM --binSize 30 --smoothLength 300 -p 10 --extendReads 200 -o results/bams/{}.bw# run multiBigwigSummary # The default output of multiBamSummary (a compressed numpy array: `*.npz`) can be visualized using plotCorrelation or plotPCA multiBigwigSummary bins -b \ results/bams/HUES64_rep1_Nanog.bw \ results/bams/HUES64_rep2_Nanog.bw \ results/bams/dEC_Nanog.bw results/bams/dEN_Nanog.bw \ results/bams/dME_Nanog.bw \ -o multibigwigsummary.npzplotPCA -in multibigwigsummary.npz \ -o plotPCAoutput.png \ -T "PCA of read counts"
# get the gencode annotation file for hg19 reference genome and compute TSS BED file from it wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz zcat gencode.v19.annotation.gtf.gz | awk 'OFS="\t" {if ($3=="transcript") {if ($7 == "+") {print $1,$4-1,$4,$12,".",$7} else {print $1,$5-1,$5,$12,".",$7}}}' | tr -d '";' | sort -k1,1V -k2,2n > gencode.v19.annotation.tss.bed# compute Matrix with output outFileSortedRegions in BED format to display in an annotation track computeMatrix reference-point --referencePoint TSS \ -b 1000 -a 1000 \ -R gencode.v19.annotation.tss.bed \ -S results/bams/*Nanog.bw \ --skipZeros \ -o results/matrixNanog_TSS.gz \ -p 6 \ --outFileSortedRegions results/regions_TSS.bed
# compute Matrix with output matrix needed for plotting profiles and heatmaps computeMatrix reference-point --referencePoint TSS \ -b 1000 -a 1000 \ -S results/bams/dEC_Nanog.bw \ -R gencode.v19.annotation.tss.bed \ --skipZeros \ --numberOfProcessors 16 \ -o dEC_Nanog_matrix.mat.gz# plot profile plotProfile -m dEC_Nanog_matrix.mat.gz -out plotProfile_dEC_Nanog.png
# plot heatmap plotHeatmap -m dEC_Nanog_matrix.mat.gz -out plotHeatmap_dEC_Nanog.png
computeMatrix reference-point --referencePoint TSS \ -b 1000 -a 1000 \ -S results/bams/*Nanog.bw \ -R gencode.v19.annotation.tss.bed \ --skipZeros --numberOfProcessors 16 \ -o Nanog_matrix.mat.gz# plot heatmap plotHeatmap -m Nanog_matrix.mat.gz -out plotNanogHeatmap.png
# KMeans clustering on the heatmap plotHeatmap -m Nanog_matrix.mat.gz \ -out plotNanogHeatmapKMeans.png \ --colorMap RdBu \ --whatToShow 'heatmap and colorbar' \ --kmeans 4
plotHeatmap -m Nanog_matrix.mat.gz \ -out plotNanogHeatmapKMeans_colorlist.png \ --colorMap Oranges_r Oranges_r Blues_r PiYG viridis \ --whatToShow 'heatmap and colorbar' \ --kmeans 4 \ --boxAroundHeatmaps no